library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.3     ✓ purrr   0.3.4
## ✓ tibble  3.1.0     ✓ dplyr   1.0.4
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(ggthemes)
library(gganimate)
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor
# Load the per-film table; column types are guessed by readr.
movies <- read_csv(file = "data/movies.csv")
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   .default = col_character(),
##   year = col_double(),
##   budget = col_double(),
##   budget_2013 = col_double(),
##   period_code = col_double(),
##   decade_code = col_double(),
##   response = col_logical(),
##   metascore = col_double(),
##   imdb_rating = col_double(),
##   imdb_votes = col_number(),
##   error = col_logical()
## )
## ℹ Use `spec()` for the full column specifications.
# Load the raw Bechdel ratings table.
# NOTE(review): raw_bechdel is not referenced again in the visible part of
# this file -- confirm whether it is still needed.
raw_bechdel <- read_csv(file = "data/raw_bechdel.csv")
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   title = col_character(),
##   year = col_double(),
##   id = col_double(),
##   imdb_id = col_character(),
##   rating = col_double()
## )
# Distribution of IMDB ratings for each value of `binary` (the Bechdel
# outcome column).
ggplot(movies, aes(x = binary, y = imdb_rating)) +
  geom_boxplot()
## Warning: Removed 202 rows containing non-finite values (stat_boxplot).

# Keep only the columns used in the exploration below and derive the first
# entry of the comma-separated `country` and `genre` fields.
subset_movies <- movies %>%
  select(
    year, title, binary, budget_2013, plot, rated, language, country, writer,
    metascore, imdb_rating, director, actors, genre, awards, runtime
  ) %>%
  mutate(
    # "[^,]+" takes everything before the first comma. The previous pattern,
    # "[A-z ]+", stopped at any non-letter (so "Sci-Fi" became "Sci") and its
    # A-z range also matched the punctuation that sits between "Z" and "a".
    first_country = str_extract(country, pattern = "[^,]+"),
    first_genre = str_extract(genre, pattern = "[^,]+")
  )

unique(subset_movies$first_genre)
##  [1] NA            "Biography"   "Action"      "Drama"       "Comedy"     
##  [6] "Crime"       "Animation"   "Horror"      "Adventure"   "Mystery"    
## [11] "Fantasy"     "Thriller"    "Documentary" "Sci-Fi"      "Musical"    
## [16] "Family"      "Romance"     "Western"

What are some interesting variables/potential relationships?

# Count films per first-listed genre and relabel genres with fewer than 100
# films as "Other". The result is only printed, not stored, and stays grouped
# by first_genre.
subset_movies %>%
  group_by(first_genre) %>%
  mutate(
    count = n(),
    genre = if_else(count < 100, "Other", first_genre)
  )
## # A tibble: 1,794 x 19
## # Groups:   first_genre [18]
##     year title  binary budget_2013 plot  rated language country writer metascore
##    <dbl> <chr>  <chr>        <dbl> <chr> <chr> <chr>    <chr>   <chr>      <dbl>
##  1  2013 21 &a… FAIL      13000000 <NA>  <NA>  <NA>     <NA>     <NA>         NA
##  2  2012 Dredd… PASS      45658735 <NA>  <NA>  <NA>     <NA>     <NA>         NA
##  3  2013 12 Ye… FAIL      20000000 In t… R     English  USA, UK "John…        97
##  4  2013 2 Guns FAIL      61000000 A DE… R     English… USA     "Blak…        55
##  5  2013 42     FAIL      40000000 The … PG-13 English  USA     "Bria…        62
##  6  2013 47 Ro… FAIL     225000000 A ba… PG-13 English… USA     "Chri…        29
##  7  2013 A Goo… FAIL      92000000 John… R     English… USA     "Skip…        28
##  8  2013 About… PASS      12000000 At t… R     English  UK      "Rich…        55
##  9  2013 Admis… PASS      13000000 A Pr… PG-13 English  USA     "Kare…        48
## 10  2013 After… FAIL     130000000 A cr… PG-13 English  USA     "Gary…        33
## # … with 1,784 more rows, and 9 more variables: imdb_rating <dbl>,
## #   director <chr>, actors <chr>, genre <chr>, awards <chr>, runtime <chr>,
## #   first_country <chr>, first_genre <chr>, count <int>
# Animated scatter plot of Metascore (log10 x-axis) against IMDB rating, with
# one frame per release year. Point size and colour both encode budget_2013.
subset_movies %>%
  select(year, metascore, imdb_rating, budget_2013, first_genre) %>%
  # Rows with a missing value in any of the selected columns are discarded.
  drop_na() %>%
  group_by(first_genre) %>%
  mutate(
    # `count` and `genre` are computed here but not mapped to any aesthetic
    # in the plot below.
    count = n(),
    genre = if_else(count < 100, "Other", first_genre),
    # gganimate steps through `year`; store it as an integer.
    year = as.integer(year)
  ) %>%
  ggplot() +
  aes(
    x = metascore,
    y = imdb_rating,
    size = budget_2013,
    colour = budget_2013
  ) +
  geom_point(alpha = 0.7, show.legend = FALSE) +
  scale_size(range = c(2, 12)) +
  scale_x_log10() +
  labs(title = 'Year: {frame_time}', x = 'Metascore', y = 'IMDB rating') +
  transition_time(year) +
  ease_aes('linear')

# Animated bar chart of the number of films per Bechdel outcome, with one
# frame per release year.
subset_movies %>%
  select(year, binary) %>%
  drop_na() %>%
  mutate(year = as.integer(year)) %>%
  ggplot(aes(x = binary, fill = binary)) +
  # FALSE rather than F: F is an ordinary variable that can be reassigned.
  geom_bar(show.legend = FALSE) +
  theme_minimal() +
  # Named values tie each colour to its outcome instead of relying on level
  # order, and match the percentage chart further down (fail = dark red,
  # pass = dark blue).
  scale_fill_manual(values = c(FAIL = "darkred", PASS = "darkblue")) +
  labs(
    title = "Year: {frame_time}",
    x = "Bechdel test result",
    y = "Count"
  ) +
  transition_time(year) +
  ease_aes("linear")

# Animated stacked bar showing the share of films passing / failing the
# Bechdel test, with one frame per release year (years after 1980 only).
subset_movies %>%
  select(year, binary) %>%
  drop_na() %>%
  mutate(year = as.integer(year)) %>%
  filter(year > 1980) %>%
  # count() returns an ungrouped tally, so no `.groups` message is emitted and
  # no grouping leaks into the later steps.
  count(year, binary, name = "count") %>%
  # values_fill: a year with only one outcome gets 0 for the other instead of
  # NA, which would otherwise turn both percentages for that year into NA.
  pivot_wider(
    names_from = binary,
    values_from = count,
    values_fill = 0L
  ) %>%
  mutate(
    percentage_Pass = round(PASS / (FAIL + PASS), 3),
    percentage_Fail = round(FAIL / (FAIL + PASS), 3)
  ) %>%
  pivot_longer(
    cols = percentage_Pass:percentage_Fail,
    names_to = "test_outcome",
    names_prefix = "percentage_",
    values_to = "percentage"
  ) %>%
  # Fixed label heights: "Pass" near the bottom of the bar, "Fail" near the top.
  mutate(label_position = if_else(test_outcome == "Pass", 0.1, 0.9)) %>%
  ggplot(aes(x = "", y = percentage, fill = factor(test_outcome))) +
  geom_col(show.legend = FALSE, position = "fill") +
  theme_classic() +
  theme(
    axis.title = element_text(size = 20, face = "bold"),
    axis.text = element_text(size = 15, face = "bold"),
    plot.title = element_text(size = 30, face = "bold", hjust = -0.2),
    plot.subtitle = element_text(size = 20)
  ) +
  # Factor levels sort as "Fail", "Pass": Fail = dark red, Pass = dark blue.
  scale_fill_manual(values = c("darkred", "darkblue")) +
  geom_text(
    aes(y = label_position, label = test_outcome, colour = test_outcome),
    size = 6,
    show.legend = FALSE
  ) +
  scale_y_continuous(labels = percent) +
  labs(
    title = "Year: {frame_time}\n",
    x = NULL,
    y = "% of Bechdel test outcomes \n",
    subtitle = ""
  ) +
  transition_time(year) +
  ease_aes("linear")